library(haven)
library(foreign)
library(readxl)
library(openxlsx)
library(car)
library(readstata13)
library(ggplot2)
library(dplyr)


########## Import survey data ##########
d <- read.dta13("~/PhD/Survey Data for Opinion_Policy Dataset/VALGUNDERSOKELSENE/Valgundersokelsen 1972/Folkeavstemningen om EF 1972 og Stortingsvalget 1973.dta")


######################################################################
#--------------------------  VARIABLES ------------------------------# 
######################################################################

# INSERT row number of issue in excell docuemnt (next number after previous entry)
issue_row_num = 1

# INSERT a policy item variable (Values = FAV, OPP, DK) and the name of that variable 
# Recode according to rules specified in separate document
d$polpref <- car::recode(d$NAME, " ' '='FAV'; ' '='OPP'; ' '='DK'")
polpref_var_name <- "NAME"

# INSERT Gender variable (1=MALE, 2=FEMALE)
d$gender <- car::recode(d$v317, "'Menn'='MALE'; 'Kvinner'='FEMALE'; else=NA")
table(d$gender)

# INSERT Age group variable (categorical, X number of groups, ordered youngest to oldest (1,2,...X), watch out for DK)
#d$age <- factor(d$alder, labels=c("1","2","3","4","5","6","7","8","9","10","11","12"))
#table(d$age)
# OR 
# INSERT Numeric AGE variable. Turns it into 10 equal-N categories
d$age <- d$v318
d %>% 
  group_by(age = ntile(age, 10)) %>% # split variable into k groups
  mutate(age_cats = paste0(min(age), "-", max(age))) %>% # create the ranges
  ungroup() -> d
table(d$age)

# INSERT Eduation group variable (categorical, X number of groups, ordered lowest to highest (1,2,...X), watch out for DK)
d$edu <- factor(d$v323, labels=c("1","2","3","4"))
table(d$edu)

# HOUSEHOLD income 
d$hincome <- d$v183
d$hincome <- factor(d$hincome, labels=c("1","2","3","4","5","6","7"))
table(d$hincome)

# INSERT Region variable
# d$region <- d$v475
# INSERT Fylke variable, NB: HEDMARK, ?ST-AGDER
#d$fylke <- car::recode(d$v351, "'Aust-agder'='?st-Agder'; 'Hedemark'='Hedmark'; 'Sogn og fjordane'='Sogn og Fjordane'; 'Nord-tr?ndelag'='Nord-Tr?ndelag'; 'Vest-agder'='Vest-Agder'; 'M?re og romsdal'='M?re og Romsdal';")  
#table(d$fylke)

# INSERT Party variable (What will they vote if election tomorrow) (Cateogrical, no order, but use same abbreviatons as in master data)
d$party <- ifelse(d$v257=="Nei", "NOVOTE", d$v258)
d$party <- car::recode(d$party, "17='RV'; 13='SV'; 1='AP'; 2='DNF'; 6='V'; 4='KRF'; 5='SP'; 3='H'; 14='ALP'; 'NOVOTE'='NOVOTE'; else=NA")
table(d$party)

######################################################################
#---------------------------- START ---------------------------------# 
######################################################################

# Age distribution 
age_dist <- as.data.frame(table(d$age, d$polpref))
age_dist$name <- paste("AGE", age_dist$Var1, sep ="")
age_dist$name <- paste(age_dist$name, age_dist$Var2, sep = "_")
age_dist <- age_dist %>% select(name, Freq)

# Education distribution
edu_dist <- as.data.frame(table(d$edu, d$polpref))
edu_dist$name <- paste("EDU", edu_dist$Var1, sep ="")
edu_dist$name <- paste(edu_dist$name, edu_dist$Var2, sep = "_")
edu_dist <- edu_dist %>% select(name, Freq)

# Income HOUSHOLD distribution
hinc_dist <- as.data.frame(table(d$hincome, d$polpref))
hinc_dist$name <- paste("HINC", hinc_dist$Var1, sep ="")
hinc_dist$name <- paste(hinc_dist$name, hinc_dist$Var2, sep = "_")
hinc_dist <- hinc_dist %>% select(name, Freq)

# Party distribution
p_dist <- as.data.frame(table(d$party, d$polpref))
p_dist$name <- paste("P", p_dist$Var1, p_dist$Var2, sep ="_")
p_dist <- p_dist %>% select(name, Freq)

# Region distribution
#rg_dist <- as.data.frame(table(d$region, d$polpref))
#rg_dist$name <- paste("RG", rg_dist$Var1, rg_dist$Var2, sep ="_")
#rg_dist <- rg_dist %>% select(name, Freq)

# Fylke distribution
#fy_dist <- as.data.frame(table(d$fylke, d$polpref))
#fy_dist$name <- paste("FY", fy_dist$Var1, fy_dist$Var2, sep ="_")
#fy_dist <- fy_dist %>% select(name, Freq)

# Gender
g_dist <- as.data.frame(table(d$gender, d$polpref))
g_dist$name <- paste(g_dist$Var1, g_dist$Var2, sep ="_")
g_dist <- g_dist %>% select(name, Freq)

# Overall
o_dist <- as.data.frame(table(d$polpref))
o_dist$name <- paste("OVERALL", o_dist$Var1, sep ="_")
o_dist <- o_dist %>% select(name, Freq)

###### Single variable distributions: 
# Single Age distribution 
age_dist_s <- as.data.frame(table(d$age))
age_dist_s$name <- paste("S_AGE", age_dist_s$Var1, sep ="")
age_dist_s <- age_dist_s %>% select(name, Freq)

# Single Education distribution
edu_dist_s <- as.data.frame(table(d$edu))
edu_dist_s$name <- paste("S_EDU", edu_dist_s$Var1, sep ="")
edu_dist_s <- edu_dist_s %>% select(name, Freq)

# Single Income HOUSHOLD distribution
hinc_dist_s <- as.data.frame(table(d$hincome))
hinc_dist_s$name <- paste("S_HINC", hinc_dist_s$Var1, sep ="")
hinc_dist_s <- hinc_dist_s %>% select(name, Freq)

# Rbind the data frames
m1 <- rbind(age_dist, edu_dist, hinc_dist, p_dist, age_dist_s, edu_dist_s, hinc_dist_s, o_dist, g_dist) 
m2 <- data.frame(t(m1))
colnames(m2) <- as.character(unlist(m2[1,]))
m2 = m2[-1, ]
m2$ISSUE <- issue_row_num
m2 <- mutate_all(m2, function(x) as.numeric(as.character(x)))

m2$ORG_VARNAME <- polpref_var_name
m2$ORG_DATASET_N <- nrow(d)

# Rbind with master data 
master_data <- read_excel("~/PhD/Dataset/PhD Dataset okt 3.xlsx")
rbind.all.columns <- function(x, y) {
  x.diff <- setdiff(colnames(x), colnames(y))
  y.diff <- setdiff(colnames(y), colnames(x))
  
  x[, c(as.character(y.diff))] <- NA
  
  y[, c(as.character(x.diff))] <- NA
  
  return(rbind(x, y))
}
master_data <- rbind.all.columns(master_data, m2)
master_data <- master_data %>% arrange(ISSUE)
write.xlsx(master_data, '~/PhD/Dataset/PhD Dataset okt 3.xlsx')

table(d$polpref,d$fylke)
table(d$polpref,d$age)
table(d$polpref,d$gender)
table(d$polpref,d$hincome)
table(d$polpref,d$edu)

######################################################################
#----------------------------- END ----------------------------------# 
######################################################################